This Jupyter Notebooks script performs k means clustering and data visualization on three data outputs from WeightedOverlayRasters, using them to make high-level planning recommendations for urban development and conservation prioritization.
Here are descriptions of the three datasets used in this analysis:
Bio_x_Risk emphasizes areas of high land cover change probability and high biodiversity intactness.
Bio_x_Risk = LCC_Probability x BII
Anthro_x_Risk emphasizes areas of high climate hazard probability and high human population.
Anthro_x_Risk = Climate_Hazards x pop2020
Urban Land Cover Probability is a dataset generated using a random forest model trained on a range of physiographic factors to predict the likelihood of a raster cell to be urban in 2033.
Data Formatting
Load raster datasets and remove any NoData values that might be present
Plot datasets to be clustered
Code
# List of tuples containing raster datasets and their titlesraster_data = [ (AnthroRisk, 'Anthropogenic Risk'), (BioRisk, 'Biodiversity Risk'), (UrbanProbability, 'Urban Probability'),]# Set a dark background styleplt.style.use('dark_background')# Create a figure with 3 subplots (arranged vertically)fig, axs = plt.subplots(3, 1, figsize=(40, 25)) # Adjust the figure size as needed# Loop through each raster dataset and plot in subplotsfor i, (data, title) inenumerate(raster_data): im = axs[i].imshow(data, cmap=cc.cm.bmy) # Use a specified colormap axs[i].set_title(title, color='white', fontsize=30, pad=30) # Increase the title font size and padding# Remove the labels on the axes axs[i].set_xticks([]) axs[i].set_yticks([])# Create an axis for the colorbar divider = make_axes_locatable(axs[i]) cax = divider.append_axes("right", size="5%", pad=0.5)# Add colorbar to the created axis fig.colorbar(im, cax=cax, orientation='vertical')# Adjust layout to prevent overlapplt.tight_layout()# Display the plotplt.show()
Merge the rasters into a single dataframe for clustering
Code
# Use the separate transform variable obtained from the data-loading functiontransform = AnthroRisk_transform# Ensure all rasters have the same shapeassert AnthroRisk.shape == BioRisk.shape == UrbanProbability.shape, "Rasters must be of the same size"# Get coordinatesrows, cols = np.indices(AnthroRisk.shape)xs, ys = rasterio.transform.xy(transform, rows, cols)# Dictionary of arraysarrays = {'x': np.array(xs),'y': np.array(ys),'Anthropogenic_Risk': AnthroRisk,'Biodiversity_Risk': BioRisk,'Urban_Probability': UrbanProbability}# Flatten the arraysflattened_arrays = {key: value.flatten() for key, value in arrays.items()}# Create a DataFramedf = pd.DataFrame(flattened_arrays)# Drop NaN values if necessarydf.dropna(inplace=True)# df.head()
# Number of clusters to try outn_clusters =list(range(2, 15))# Run kmeans for each value of kinertias = []for k in n_clusters:# Initialize and run kmeans = KMeans(n_clusters=k, n_init=10, max_iter=600) kmeans.fit(X)# Save the "inertia" inertias.append(kmeans.inertia_)# Plot it!# Set a dark background style for the plotplt.style.use('dark_background')# Plotting the dataplt.plot(n_clusters, inertias, marker='o', ms=10, mfc='white', lw=4, mew=3, color='blue')# Setting labels and title with light color for visibilityplt.title('Inertia vs Number of Clusters', color='white')plt.xlabel('Number of Clusters', color='white')plt.ylabel('Inertia', color='white')# Display the plotplt.show()
Determine the appropriate number of clusters using KneeLocator
Code
# Initialize the knee algorithmkn = KneeLocator(n_clusters, inertias, curve='convex', direction='decreasing')# Print out the knee print(kn.knee)
4
Fit the appropriate number of clusters to the data
Code
# Initialize the Kmeans object# Using the identified kneekmeans = KMeans(n_clusters=(kn.knee), random_state=42, n_init=10)# Using a manually-assigned knee# kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)# Run the fit! This adds the ".labels_" attributekmeans.fit(X);# Save the cluster labelsdf["cluster"] = kmeans.labels_df.head()
x
y
Anthropogenic_Risk
Biodiversity_Risk
Urban_Probability
cluster
0
-84.389663
10.129249
0.0
0.668739
0.000073
0
1
-84.389385
10.129249
0.0
0.665026
0.000071
0
2
-84.389107
10.129249
0.0
0.661313
0.000083
0
3
-84.388829
10.129249
0.0
0.655744
0.000098
0
4
-84.388552
10.129249
0.0
0.653149
0.000103
0
Code
# Count the number of rows for each cluster and sort them to see which clusters have the greatest number of rowscluster_counts = df['cluster'].value_counts().sort_values(ascending=False)# Display the countsprint('Number of Pixels per Cluster')print(cluster_counts)# print(df.dtypes)
Number of Pixels per Cluster
1 971354
0 924702
2 228555
3 112740
Name: cluster, dtype: int64
Make a multi-bar chart for each cluster
Code
# Group by 'cluster' and calculate mean for each columncluster_summary = df.groupby('cluster')[['Anthropogenic_Risk', 'Biodiversity_Risk', 'Urban_Probability']].mean()# Normalize the datacluster_summary_normalized = (cluster_summary - cluster_summary.min()) / (cluster_summary.max() - cluster_summary.min())# Set dark mode styleplt.style.use('dark_background')# Define the number of groups and bar widthn_groups =len(cluster_summary_normalized)bar_width =0.25# Define x coordinates for each groupindex = np.arange(n_groups)x1 = index - bar_widthx2 = indexx3 = index + bar_width# Plotting each column in grouped barsfig, ax = plt.subplots(figsize=(12, 6))# Titles and colors for each plotcolumns = ['Anthropogenic_Risk', 'Biodiversity_Risk', 'Urban_Probability']colors = ['#3e208d', '#a02300', 'darkgrey']plot_titles = ['Anthropogenic Risk Per Cluster', 'Biodiversity Risk per Cluster', 'Urban Probability per Cluster']# Use a for loop to create each group of barsfor x, color, column, title inzip([x1, x2, x3], colors, columns, plot_titles): ax.bar(x, cluster_summary_normalized[column], bar_width, label=title, color=color)# Add titles, labels, and legendax.set_title('Normalized Average Values per Cluster', color='white')ax.set_xlabel('Cluster', color='white')ax.set_ylabel('Normalized Average Values', color='white')ax.legend()# Set x-ticks to be in the middle of the group of barsax.set_xticks(index)ax.set_xticklabels(cluster_summary.index)# Adjust layout and show plotplt.tight_layout()plt.show()
Here we see how each one of the clusters is composed of a different combination of values from each of these three indicies. Here are some high-level takeaways about the implications of each cluster.
Findings
Land in Cluster 0 should be considered for conservation, as this means they have a high combination of land cover change probability and biodiversity.
No Anthropogenic Risk, High Biodiversity Risk, No Probability of Urban Land Cover
Land in Cluster 1 can be thought of as regional landscape assets. Their low risk level makes conservation a lower prioirty, but minimizing human impact on these lands will help maintain regional biodiversity.
No Anthropogenic Risk, Low Biodiversity Risk, No Probability of Urban Land Cover
Land in Cluster 2 is the most suitable for urban development out of the four clusters, since anthropogenic risk and biodiversity risk are relatively low there.
Low Anthropogenic Risk, Low Biodiversity Risk, High Probability of Urban Land Cover
Land in Cluster 3 should be considered for Nature-Based Solutions (NbS) implementation, such as green stormwater infrastructure, either in the form of retrofits or included in areas of new urban development.
High Anthropogenic Risk, No Biodiversity Risk, High Probability of Urban Land Cover
Visualizations
Plot the clusters across space
Code
# Set the style of the visualization with a dark backgroundsns.set(style="darkgrid")custom_palette = {0: '#cd0000',1: '#328232',2: '#A9A9A9',3: '#277bdb',}# Create a scatter plot with the custom paletteplt.figure(figsize=(10, 6))scatter = sns.scatterplot( x='x', y='y', hue='cluster', palette=custom_palette, data=df, s=0.5# Point size)# Adding title and labels with a light color for visibility against the dark backgroundplt.title('K Means Clustering', color='white')plt.xlabel('X Coordinate', color='white')plt.ylabel('Y Coordinate', color='white')# Change the color of ticks and labels for visibilityplt.xticks(color='white')plt.yticks(color='white')# Modify legend for better visibility on dark backgroundlegend = plt.legend(title='Cluster', loc='upper right', bbox_to_anchor=(1.1, 1.05), frameon=True)plt.setp(legend.get_texts(), color='white') # Set the legend text colorplt.setp(legend.get_title(), color='white') # Set the legend title color# Set the face and edge color of the figure to match the dark themeplt.gcf().set_facecolor('#303030')plt.gca().set_facecolor('#303030')plt.gca().spines['top'].set_color('white')plt.gca().spines['bottom']. set_color('white')plt.gca().spines['left'].set_color('white')plt.gca().spines['right'].set_color('white')# Display the plotplt.show()
This plot shows two types of clusters in the urban area and two types of clusters in the non-urban area.
Code
# Sample the DataFrame at 5%sampled_df = df.sample(frac=0.05)# Create a 3D scatter plot using Plotlyfig = px.scatter_3d( sampled_df, x='Anthropogenic_Risk', y='Biodiversity_Risk', z='Urban_Probability', title='3D Scatter Plot', labels={'Anthropogenic_Risk': 'Anthropogenic Risk Index','Biodiversity_Risk': 'Biodiversity Risk Index','Urban_Probability': 'Suitability of Urban Land Cover','cluster': 'Cluster' })# Update marker colors and hover information directlyfig.update_traces( marker=dict( size=1.5, color=[custom_palette[c] for c in sampled_df['cluster']], opacity=0.75 ), hovertemplate="Anthropogenic Risk: %{x}<br>Biodiversity Risk: %{y}<br>Urban Probability: %{z}<br>Cluster: %{customdata}")# Add custom data for hover informationfig.update_traces(customdata=sampled_df['cluster'])# Update the layout for dark modefig.update_layout( margin=dict(l=0, r=0, b=0, t=0), paper_bgcolor='rgb(10,10,10)', plot_bgcolor='rgb(10,10,10)', title_font_color='white', font_color='white', height=900, width=1600, scene=dict( xaxis=dict( backgroundcolor='rgb(32,32,32)', gridcolor='gray', showbackground=True, zerolinecolor='gray'), yaxis=dict( backgroundcolor='rgb(32,32,32)', gridcolor='gray', showbackground=True, zerolinecolor='gray'), zaxis=dict( backgroundcolor='rgb(32,32,32)', gridcolor='gray', showbackground=True, zerolinecolor='gray'), ))# Show the plotfig.show()
This interactive plot allows us to visualize each of the factors as it varies within the cluster. Try zooming in and rotating!